---
title: "data_manip"
author: "Olivier Bergeron-Boutin"
date: "09/01/2022"
output: html_document
---

```{r setup, include=FALSE}
# Show each chunk's source code in the rendered document by default.
knitr::opts_chunk$set(echo = TRUE)
# Evaluate chunks from the RStudio project root (as located by rprojroot), so
# that relative paths such as "data/..." resolve from there.
knitr::opts_knit$set(root.dir = rprojroot::find_rstudio_root_file())
```

# Data and packages 

```{r}
# Loading packages we'll be using
library(tidyverse)
library(magrittr)

# Raw survey data downloaded from Qualtrics.
# NOTE(review): load() is expected to create the `survey_raw` object used
# throughout this file -- confirm against the contents of the .RData file.
load("data/can_covid_raw.RData")

survey_raw <- survey_raw %>%
  # The first row is standard Qualtrics output, not a respondent
  slice(-1) %>%
  # Keep only respondents with a non-missing answer to the last question
  filter(!is.na(conjoint_recall)) %>%
  # Shorter names for the respondent identifier and the completion time
  rename(
    id = ResponseId,
    duration = `Duration (in seconds)`
  )
```

# Collapsing values into single columns

```{r}
# The worry items were stored in separate desktop/mobile columns, and the
# conjoint outcome and task-time items in separate courts/parliament columns.
# Collapse each pair into a single column per item number and attach the
# result back onto the full data set.
survey_raw %<>%
  dplyr::select(id, contains(c("worry", "outcome", "time"))) %>%
  # Column names are split on their two underscores into item/qtype/number
  pivot_longer(cols = -id,
               names_pattern = "(.*)_(.*)_(.*)",
               names_to = c("item", "qtype", "number")) %>%
  # One row per respondent and item number, one column per item_qtype pair.
  # `id_cols` is left at its default (every column not used by `names_from`
  # or `values_from`, i.e. id and number). Passing `-item` positionally is
  # deprecated, and `item` is a `names_from` column, so it cannot be an id.
  pivot_wider(names_from = c(item, qtype),
              values_from = value) %>%
  # For each pair keep whichever version is non-missing (first one wins)
  transmute(id = id,
            number = number,
            worry = coalesce(worry_desktop, worry_mobile),
            conjoint = coalesce(outcome_courts, outcome_parliament),
            time = coalesce(time_courts, time_parliament)) %>%
  # Back to one row per respondent: worry_1, ..., conjoint_1, ..., time_1, ...
  pivot_wider(names_from = number,
              values_from = c(worry, conjoint, time)) %>%
  # Only the worry battery has items 7 and 8
  dplyr::select(-conjoint_7, -conjoint_8, -time_7, -time_8) %>%
  # `survey_raw` here is still the original data; merge() puts `id` first
  # and sorts the rows by `id`
  merge(survey_raw, ., by = "id")

# The split source columns are no longer needed
survey_raw %<>%
  dplyr::select(-contains(c("worry_desktop", "worry_mobile",
                            "outcome_parliament", "outcome_courts",
                            "time_courts", "time_parliament")))

## Outcome for vignette experiment
# Concatenate the eight exp1 columns and strip the literal "NA" text that
# paste0() produces for missing values.
# NOTE(review): a respondent with all eight columns missing ends up with ""
# here, which the later `exp1_outcome == "Yes"` recode turns into 0 rather
# than NA -- confirm that this is intended.
survey_raw %<>%
  mutate(
    exp1_outcome = gsub("NA", "", paste0(exp1_1, exp1_2, exp1_3, exp1_4,
                                         exp1_5, exp1_6, exp1_7, exp1_8))
  ) %>%
  dplyr::select(-exp1_1:-exp1_8)
```

# Recoding

## Party id

```{r}
# Relabel "None of these" as "Non-partisan" and store party identification as
# a factor; any value outside the five listed levels becomes NA.
survey_raw$partyid <- factor(
  dplyr::recode(survey_raw$partyid, "None of these" = "Non-partisan"),
  levels = c("Liberal", "Conservative", "NDP", "Green", "Non-partisan")
)
```

## Numeric scales 

```{r}
# Convert the text response options to numbers. Each recode is written as a
# lambda because passing extra arguments through across()'s `...` is
# deprecated. Responses not listed in a recode become NA.
survey_raw %<>%
  mutate(
    # Anxiety items, coded 0 to 1
    across(c(anxious1_think, anxious2_activities),
           ~ dplyr::recode(.x, "Never" = 0, "Rarely" = 0.33,
                           "Sometimes" = 0.66, "Often" = 1)),
    # Worry items coded 1 to 7
    across(worry_1:worry_8,
           ~ dplyr::recode(.x,
                           "1 - Not at all worried" = 1, "2" = 2, "3" = 3,
                           "4" = 4, "5" = 5, "6" = 6,
                           "7 - Extremely worried" = 7)),
    # Policy support items, coded 1 to 7
    across(support_1:support_8,
           ~ dplyr::recode(.x,
                           "1 - Strongly disapprove" = 1, "2" = 2, "3" = 3,
                           "4" = 4, "5" = 5, "6" = 6,
                           "7 - Strongly approve" = 7)),
    # Agreement items, coded 1 to 7
    across(c(social_1:social_5, backslide_1),
           ~ dplyr::recode(.x,
                           "1 - Strongly disagree" = 1, "2" = 2, "3" = 3,
                           "4" = 4, "5" = 5, "6" = 6,
                           "7 - Strongly agree" = 7)),
    # Five-point agreement items, coded 1 to 5
    across(c(backslide_obstruct, backslide_ignore),
           ~ dplyr::recode(.x, "Strongly disagree" = 1,
                           "Somewhat disagree" = 2,
                           "Neither agree nor disagree" = 3,
                           "Somewhat agree" = 4, "Strongly agree" = 5)),
    # Deprivation items, coded 1 to 7
    across(deprive_1:deprive_3,
           ~ dplyr::recode(.x, "Not at all" = 1, "2" = 2, "3" = 3, "4" = 4,
                           "5" = 5, "6" = 6, "A great deal" = 7)),
    # Binary indicators: 1 for the named response, 0 for any other response,
    # NA when the response is missing
    authority1_respect = ifelse(authority1_respect == "Respect for elders", 1, 0),
    authority2_obedience = ifelse(authority2_obedience == "Obedience", 1, 0),
    authority3_behave = ifelse(authority3_behave == "Being well-behaved", 1, 0),
    exp1_outcome = ifelse(exp1_outcome == "Yes", 1, 0),
    can_citizen = ifelse(can_citizen == "Yes", 1, 0),
    attn_check = ifelse(attn_check == "Blue", 1, 0),
    # Stored as something other than numeric in the raw data
    across(c(duration, birth_year),
           as.numeric)
  )
```

## Executive aggrandizement scale

```{r}
# Rescale both 1-5 items to run from 0 to 1, then average them into a
# two-item scale
survey_raw <- survey_raw %>%
  mutate(
    across(c(backslide_obstruct, backslide_ignore), ~ (.x - 1) / 4),
    ea_2item = (backslide_obstruct + backslide_ignore) / 2
  )
```


## Anxiety scale

```{r}
# anxiety_scale: mean of the two anxiety items (each coded 0 / 0.33 / 0.66 / 1).
# anxiety_4cat: label of the higher of the two responses. When one item is
# missing the result is "Often" if the other item equals 1 and NA otherwise.
survey_raw <- survey_raw %>%
  mutate(
    anxiety_scale = (anxious1_think + anxious2_activities) / 2,
    anxiety_4cat = case_when(
      # Either item at the top of the scale
      anxious1_think == 1 | anxious2_activities == 1 ~ "Often",
      # One item at 0.66 and the other not at the top
      (anxious1_think == 0.66 & anxious2_activities != 1) |
        (anxious1_think != 1 & anxious2_activities == 0.66) ~ "Sometimes",
      # One item at 0.33 and the other no higher
      (anxious1_think == 0.33 & anxious2_activities <= 0.33) |
        (anxious1_think <= 0.33 & anxious2_activities == 0.33) ~ "Rarely",
      # Both items at the bottom of the scale
      anxious1_think == 0 & anxious2_activities == 0 ~ "Never"
    )
  )
```

## Deprivation scale

```{r}
# Rescale the three 1-7 deprivation items to run from 0 to 1, then average
survey_raw <- survey_raw %>%
  mutate(
    across(deprive_1:deprive_3, ~ (.x - 1) / 6),
    deprivation_scale = (deprive_1 + deprive_2 + deprive_3) / 3
  )
```

## Authoritarianism

```{r}
survey_raw <- survey_raw %>%
  mutate(
    # Missingness flags (1 = missing) for the three authoritarianism items
    authority1_isna = as.numeric(is.na(authority1_respect)),
    authority2_isna = as.numeric(is.na(authority2_obedience)),
    authority3_isna = as.numeric(is.na(authority3_behave)),
    # Number of missing items, and a flag for respondents with none missing
    authority_na = authority1_isna + authority2_isna + authority3_isna,
    authority_full = as.numeric(authority_na == 0),
    # Missing items are set to the scale midpoint
    authority1_respect = ifelse(authority1_isna, 0.5, authority1_respect),
    authority2_obedience = ifelse(authority2_isna, 0.5, authority2_obedience),
    authority3_behave = ifelse(authority3_isna, 0.5, authority3_behave),
    # Mean of the three items, then dichotomised: 1 if above 0.5, 0 otherwise
    authority_scale_alt = (authority1_respect + authority2_obedience +
                             authority3_behave) / 3,
    authority_scale_alt = ifelse(authority_scale_alt > 0.5, 1, 0)
  )
```


## Birth decades

```{r}
# Decade of birth as an ordered set of factor levels. Everyone born in 1990
# or later is grouped under the "1990s" label.
survey_raw <- survey_raw %>%
  mutate(
    birth_decade = factor(
      case_when(
        birth_year < 1950 ~ "Pre-1950s",
        between(birth_year, 1950, 1959) ~ "1950s",
        between(birth_year, 1960, 1969) ~ "1960s",
        between(birth_year, 1970, 1979) ~ "1970s",
        between(birth_year, 1980, 1989) ~ "1980s",
        birth_year >= 1990 ~ "1990s"
      ),
      levels = c("Pre-1950s", "1950s", "1960s", "1970s", "1980s", "1990s")
    )
  )
```

## Education categories

```{r}
# educ_cat: six-level education factor built from the raw `education` answer.
# educ_4cat: coarser four-level version derived from educ_cat.
# NOTE(review): any `education` value not listed below (for example a
# "completed" technical/community college option, if the survey had one)
# falls through to NA -- confirm against the raw response options.
survey_raw <- survey_raw %>%
  mutate(
    educ_cat = factor(
      case_when(
        education %in% c("No schooling",
                         "Some elementary school",
                         "Completed elementary school",
                         "Some secondary/high school") ~ "High school or less",
        education %in% "Completed secondary/high school" ~ "High school graduate",
        education %in% c("Some technical, community college, classical college",
                         "Some university") ~ "Some college completed",
        education %in% "Bachelor's degree" ~ "College degree",
        education %in% "Master's degree" ~ "Master's degree",
        education %in% "Professional degree or doctorate" ~ "PhD/Doctorate"
      ),
      levels = c("High school or less", "High school graduate",
                 "Some college completed", "College degree",
                 "Master's degree", "PhD/Doctorate")
    ),
    educ_4cat = factor(
      case_when(
        educ_cat %in% "High school or less" ~ "Less than HS",
        educ_cat %in% "High school graduate" ~ "Completed high school",
        educ_cat %in% "Some college completed" ~ "Some postsecondary",
        educ_cat %in% c("College degree",
                        "Master's degree",
                        "PhD/Doctorate") ~ "College graduate"
      ),
      levels = c("Less than HS", "Completed high school",
                 "Some postsecondary", "College graduate")
    )
  )
```

## Region

```{r}
# Group provinces into four regions; provinces not listed here become NA
survey_raw <- survey_raw %>%
  mutate(
    region = factor(
      case_when(
        province %in% c("Manitoba", "Alberta", "Saskatchewan") ~ "Prairies",
        province %in% "British Columbia" ~ "British Columbia",
        province %in% "Ontario" ~ "Ontario",
        province %in% c("New Brunswick", "Newfoundland and Labrador",
                        "Nova Scotia", "Prince Edward Island") ~ "Atlantic"
      ),
      levels = c("Ontario", "Atlantic", "Prairies", "British Columbia")
    )
  )
```

## Lockdown preferences

```{r}
# Shorter labels for the respondent's own lockdown preference; any other
# value is left unchanged by recode()
survey_raw$covid_lockdowns <- dplyr::recode(
  survey_raw$covid_lockdowns,
  "Lockdowns should be ended immediately" = "Lockdown ended now",
  "Lockdowns should continue until a COVID-19 vaccine is found" =
    "Lockdown until vaccine",
  "Lockdowns should continue until there are fewer COVID-19 deaths" =
    "Lockdown until fewer deaths"
)
```

## Exp1 condition

```{r}
# Readable labels for the governor's party in experiment 1
survey_raw$gov_pid <- dplyr::recode(
  survey_raw$gov_pid,
  "conservative" = "Conservative",
  "liberal" = "Liberal",
  "ndp" = "New Democrat",
  "no_pid" = "No party"
)

# Experimental condition: the governor's party label, a slash, and the third
# underscore-separated piece of `gov_group` (the governor's objective).
# Combinations outside the eight listed levels become NA.
survey_raw$exp1_condition <- factor(
  paste0(
    survey_raw$gov_pid,
    "/",
    vapply(strsplit(survey_raw$gov_group, "_"), "[", character(1), 3)
  ),
  levels = c("Conservative/lockdown",
             "Liberal/lockdown", "New Democrat/lockdown",
             "No party/lockdown", "Conservative/open",
             "Liberal/open", "New Democrat/open", "No party/open")
)
```

# Data exclusions

## Straightliners

```{r}
# Count, per respondent, the number of distinct answers across the worry,
# support and social batteries (a missing answer counts as its own value).
# A respondent with a single distinct value is flagged as a straightliner.
survey_raw <- survey_raw %>%
  dplyr::select(id, worry_1:worry_8, support_1:support_8, social_1:social_5) %>%
  pivot_longer(cols = -id, names_to = "key", values_to = "val") %>%
  group_by(id) %>%
  summarise(uniques = length(unique(val)), .groups = "drop") %>%
  # Attach the counts to the full data: `id` and `uniques` come first,
  # followed by the remaining survey columns, with rows sorted by `id`
  merge(survey_raw, by = "id") %>%
  mutate(straightliner = as.numeric(uniques == 1))
```


## Subsetting

```{r}
# Exclusions. Keep respondents who passed the attention check, are not
# straightliners, are Canadian citizens, and whose completion time is at
# least 300 and under 3600 seconds. A missing value on any of these drops
# the row.
survey_raw <- survey_raw %>%
  filter(
    attn_check == 1,
    straightliner == 0,
    can_citizen == 1,
    duration >= 300,
    duration < 3600
  )

# Respondent-level data set; `survey_raw` is reused below for the conjoint data
survey <- survey_raw
```


# Conjoint
```{r}
# Long format: one row per respondent and candidate profile. The profile
# column names (traits1a ... traits6b) encode the task number in character 7
# and the profile letter in character 8.
# gather() is kept (rather than pivot_longer()) so that the row order going
# into merge() below is unchanged.
conjoint_gather <- survey_raw %>%
  gather(key = profile_att, value = attributes, traits1a:traits6b) %>%
  arrange(EndDate) %>%
  mutate(
    task = as.numeric(substring(profile_att, 7, 7)),
    profile = dplyr::recode(substring(profile_att, 8, 8), "a" = 1, "b" = 2),
    # Name of the outcome column that belongs to this task
    outcome = paste0("conjoint_", task)
  ) %>%
  dplyr::select(-profile_att)

# One row per respondent and task, holding the chosen candidate
# (1 = Candidate A, 2 = Candidate B)
outcomes <- survey_raw %>%
  dplyr::select(conjoint_1:conjoint_6, id) %>%
  gather(outcome, choice, -id) %>%
  mutate(choice = dplyr::recode(choice, "Candidate A" = 1, "Candidate B" = 2))

# Attach each task's choice to both of its profiles, keeping every profile row
conjoint <- merge(
  conjoint_gather,
  outcomes,
  by = c("id", "outcome"),
  all.x = TRUE
)

# Split the "|"-separated `attributes` string into one column per attribute
# (named by position), bind those columns in front of the existing data, then
# shorten the long policy texts and convert the attributes to factors.
conjoint <- do.call(rbind.data.frame, strsplit(conjoint$attributes, "\\|")) %>%
  dplyr::rename(
    Gender = 1,
    Age = 2,
    `Political experience` = 3,
    `Party` = 4,
    `Economic aid policy` = 5,
    `Lockdown policy` = 6,
    `Democracy` = 7
  ) %>%
  cbind(conjoint) %>%
  mutate(
    # 1 if this profile is the one the respondent chose in the task
    selected = ifelse(profile == choice, 1, 0),
    Gender = factor(Gender, levels = c("Male", "Female")),
    Age = factor(Age, levels = c("37", "39", "43", "45", "52", "57", "61", "66", "71", "75")),
    `Political experience` = factor(
      `Political experience`,
      levels = c("No political experience", "Mayor", "Member of provincial legislature", "Member of Parliament")
    ),
    Party = factor(Party),
    `Economic aid policy` = factor(
      dplyr::recode(
        `Economic aid policy`,
        "Says economic aid to address the COVID-19 crisis should ensure a basic income of $1,000 per month for everyone" = "Basic income",
        "Says economic aid to address the COVID-19 crisis should mostly be given to businesses" = "Mostly to businesses",
        "Says economic aid to address the COVID-19 crisis should mostly be given to workers who have lost their jobs" = "Mostly to workers"
      ),
      levels = c("Mostly to businesses", "Mostly to workers", "Basic income")
    ),
    `Lockdown policy` = factor(
      dplyr::recode(
        `Lockdown policy`,
        "Says lockdowns should be ended immediately" = "Lockdown ended now",
        "Says lockdowns should continue until a COVID-19 vaccine is found" = "Lockdown until vaccine",
        "Says lockdowns should continue until there are fewer COVID-19 deaths" = "Lockdown until fewer deaths"
      ),
      levels = c("Lockdown ended now", "Lockdown until fewer deaths", "Lockdown until vaccine")
    ),
    # Two spellings of the "comply" text (one with a doubled "with") both map
    # to "Obey courts". Democracy is turned into a factor later, per subset.
    `Democracy` = dplyr::recode(
      `Democracy`,
      "Says that a prime minister should comply with court decisions overturning his/her policies to combat a pandemic" = "Obey courts",
      "Says that a prime minister should comply with with court decisions overturning his/her policies to combat a pandemic" = "Obey courts",
      "Says that a prime minister should ignore court decisions overturning his/her policies to combat a pandemic" = "Ignore courts",
      "Says that a prime minister should shut down Parliament if it is obstructing his/her policies to combat a pandemic" = "Shut down Parliament",
      "Says that a prime minister should work with Parliament even if it is obstructing his/her policies to combat a pandemic" = "Work with Parliament"
    )
  ) %>%
  # Identifiers and outcome first, then the attributes, then everything else
  dplyr::select(id, task, profile, choice, selected, Gender:Democracy, everything())

# A dummy variable that indicates whether a candidate is for continuing lockdown
conjoint$pro_lockdown_cand <- as.numeric(
  conjoint$`Lockdown policy` %in% c("Lockdown until fewer deaths",
                                    "Lockdown until vaccine")
)

# Profiles whose democracy attribute was about Parliament.
# Unused factor levels are dropped, the attribute is releveled with
# "Work with Parliament" as the first level, and it is renamed.
conjoint_parliament <- conjoint %>%
  filter(str_detect(Democracy, "Parliament")) %>%
  droplevels() %>%
  mutate(
    Democracy = factor(Democracy,
                       levels = c("Work with Parliament", "Shut down Parliament"))
  ) %>%
  dplyr::rename(`Legislative checks` = Democracy)

# Profiles whose democracy attribute was about the courts, handled the same
# way with "Obey courts" as the first level.
conjoint_courts <- conjoint %>%
  filter(str_detect(Democracy, "court")) %>%
  droplevels() %>%
  mutate(
    Democracy = factor(Democracy, levels = c("Obey courts", "Ignore courts"))
  ) %>%
  dplyr::rename(`Judicial checks` = Democracy)

# On which row of the conjoint table was the democracy attribute shown?
# Per the original author's note, an "order" embedded-data field equal to 6
# means the javascript selected the 6th element of the vector of features.
# order1 == 6 maps to row "5", order2 == 6 to row "6" and order3 == 6 to
# row "7". Later fields take precedence, and a missing later field yields NA.
conjoint_parliament$democracy_row <- with(
  conjoint_parliament,
  ifelse(order3 == 6, "7",
         ifelse(order2 == 6, "6",
                ifelse(order1 == 6, "5", NA)))
)

# Same lookup for the courts subset
conjoint_courts$democracy_row <- with(
  conjoint_courts,
  ifelse(order3 == 6, "7",
         ifelse(order2 == 6, "6",
                ifelse(order1 == 6, "5", NA)))
)
```

## Task time

```{r}
# Each profile row carries the times for all six tasks (time_1 ... time_6).
# Stack them, keep only the time that belongs to the row's own task, and
# store it as a numeric `task_time`.
conjoint_courts <- conjoint_courts %>%
  pivot_longer(cols = time_1:time_6,
               names_to = "task2",
               values_to = "task_time") %>%
  mutate(
    # "time_3" -> 3
    task2 = as.double(sub("time_", "", task2, fixed = TRUE)),
    task_time = as.numeric(task_time)
  ) %>%
  filter(task == task2) %>%
  dplyr::select(-task2)

# Same steps for the Parliament subset
conjoint_parliament <- conjoint_parliament %>%
  pivot_longer(cols = time_1:time_6,
               names_to = "task2",
               values_to = "task_time") %>%
  mutate(
    task2 = as.double(sub("time_", "", task2, fixed = TRUE)),
    task_time = as.numeric(task_time)
  ) %>%
  filter(task == task2) %>%
  dplyr::select(-task2)
```

## Unnecessary columns in conjoint datasets

```{r}
# Keep only the identifiers, task information, candidate attributes and the
# respondent covariates in each conjoint data set
conjoint_courts <- dplyr::select(
  conjoint_courts,
  id, RecordedDate, duration, task, task_time, profile, selected,
  Gender:`Judicial checks`,
  partyid, anxious1_think, anxious2_activities, anxiety_4cat,
  covid_lockdowns, pro_lockdown_cand, democracy_row
)

conjoint_parliament <- dplyr::select(
  conjoint_parliament,
  id, RecordedDate, duration, task, task_time, profile, selected,
  Gender:`Legislative checks`,
  partyid, anxious1_think, anxious2_activities, anxiety_4cat,
  covid_lockdowns, pro_lockdown_cand, democracy_row
)
```


# Unnecessary columns in survey dataset

```{r}
# Drop columns that are not needed in the respondent-level data set.
# The ranges rely on the current column order of `survey`.
survey <- survey %>%
  dplyr::select(
    -c(
      # used to identify straightliners
      uniques, support_1:attn_check, worry_1:worry_8, straightliner,
      # keeping just RecordedDate
      StartDate:EndDate,
      # already used to subset; no variation
      can_citizen, consent,
      # order of democracy row
      order1:order3,
      # conjoint data we don't need in this dataframe
      traits1a:traits6b, conjoint_1:time_6,
      # used to create authoritarianism scale
      authority1_isna:authority_full
    )
  )
```


# Saving data

```{r}
# Write the three cleaned data sets to a single .RData file
save(
  list = c("survey", "conjoint_parliament", "conjoint_courts"),
  file = "data/can_covid_clean.RData"
)
```